http://arxiv.org/pdf/1402.1869v2.pdf
"A linear region of a piecewise linear function $F : R^{n_0} \to R^m$ is a maximal connected subset of the input-space $R^{n_0}$, on which $F$ is linear. For the functions that we consider, each linear region has full dimension, $n_0$."
In [1]:
import numpy as np
import numpy.random as npr
import pylab as pl
import seaborn as sns
pl.rcParams['font.family']='Serif'
%matplotlib inline
In [2]:
def F(x):
if x < 0:
return 0
elif x < 5:
return 2*x
else:
return 10
In [201]:
x = np.arange(-10,10,0.01)
y = np.array([F(x) for x in x])
pl.subplot(3,1,1)
pl.title('Raw')
pl.plot(x,y)
pl.xlim(-3,8)
pl.ylim(-1,11)
pl.ylabel(r'$y$')
pl.subplot(3,1,2)
pl.title('First derivative')
deriv = np.diff(y,1,axis=0)
pl.plot(x[1:],deriv);
pl.xlim(-3,8)
pl.ylim(np.min(deriv)-0.01,np.max(deriv)+0.01)
pl.ylabel(r'$dy/dx$')
pl.subplot(3,1,3)
pl.title('Second derivative')
deriv2 = np.diff(y,2,axis=0)
pl.plot(x[2:],deriv2);
pl.xlim(-3,8)
pl.ylim(np.min(deriv2)-0.01,np.max(deriv2)+0.01)
pl.ylabel(r'$d^2y/dx^2$')
pl.tight_layout()
pl.xlabel(r'$x$')
Out[201]:
In [11]:
relu = lambda x: x*(x>0)
sigmoid = lambda x : 1.0 / (1.0 + np.exp(-x))
class FeedforwardNet():
def __init__(self,layer_dims=[32,16,8,4,2,4,8,16,32],
activation=relu):
self.activation=activation
self.layer_dims = layer_dims
self.W = []
self.b = []
for i in range(1,len(layer_dims)):
self.W.append(npr.randn(layer_dims[i-1],layer_dims[i]))
self.b.append(npr.randn())
self.num_params = sum([np.prod(w.shape) for w in self.W]) + len(self.b)
self.bottleneck = np.argmin(np.array(layer_dims))
def mats_to_vec(self,W,b):
w_vecs = []
for w in W:
w_vecs.append(np.reshape(w,np.prod(w.shape)))
w_vecs.append(np.array(b))
return np.hstack(w_vecs)
def vec_to_mats(self,w_vecs):
ind = 0
W = []
for i in range(len(self.W)):
size = np.prod(self.W[i].shape)
W.append(np.reshape(w_vecs[ind:ind+size],self.W[i].shape))
ind += size
bias = w_vecs[ind:]
return W,bias
def transform(self,X,end_layer=None):
if end_layer==None:
end_layer = len(self.layer_dims)-1
def transform_one(x):
L = x
for i in range(end_layer):
L = self.activation(L.dot(self.W[i])+self.b[i])
return L
if len(X.shape) > 1:
y = np.zeros((len(X),self.layer_dims[end_layer]))
for i,x in enumerate(X):
y[i] = transform_one(x)
else:
y = transform_one(X)
return y
In [425]:
size=100
inputs = npr.rand(100000,2)*size-(size/2)
In [447]:
layer_sizes = [2,10,10,2]
npr.seed(10)
sigmoid_net = FeedforwardNet(layer_sizes,sigmoid)
outputs = sigmoid_net.transform(inputs)
pl.title('Sigmoid')
pl.scatter(inputs[:,0],inputs[:,1],c=outputs[:,0],linewidths=0,cmap='rainbow')
pl.xlim(-(size/2),size/2)
pl.ylim(-(size/2),size/2)
pl.figure()
pl.scatter(outputs[:,0],outputs[:,1],c=inputs[:,0]*inputs[:,1],linewidths=0,cmap='rainbow')
pl.figure()
npr.seed(10)
relu_net = FeedforwardNet(layer_sizes,relu)
outputs = relu_net.transform(inputs)
pl.title('ReLU')
pl.scatter(inputs[:,0],inputs[:,1],c=outputs[:,0],linewidths=0,cmap='rainbow')
pl.xlim(-(size/2),size/2)
pl.ylim(-(size/2),size/2)
pl.figure()
pl.scatter(outputs[:,0],outputs[:,1],c=inputs[:,0]*inputs[:,1],linewidths=0,cmap='rainbow')
Out[447]:
In [557]:
size=100
inputs = npr.rand(100000,2)*size-(size/2)
layer_sizes = [2,10,10,2]
npr.seed(10)
sigmoid_net = FeedforwardNet(layer_sizes,sigmoid)
outputs = sigmoid_net.transform(inputs)
perturbation_0 = np.zeros(inputs.shape)
perturbation_0[:,0] = np.ones(len(inputs))*dx
outputs_0 = sigmoid_net.transform(inputs + perturbation_0)
perturbation_1 = np.zeros(inputs.shape)
perturbation_1[:,1] = np.ones(len(inputs))*dx
outputs_1 = sigmoid_net.transform(inputs + perturbation_1)
gradient_mag = np.abs(outputs_0[:,0]-outputs[:,0]) + np.abs(outputs_1[:,0]-outputs[:,0])
pl.title('Sigmoid')
c = np.diff(outputs[:,0],1,axis=0)
pl.scatter(inputs[:,0],inputs[:,1],c=gradient_mag,linewidths=0,cmap='rainbow')
pl.xlim(-(size/2),size/2)
pl.ylim(-(size/2),size/2)
pl.figure()
npr.seed(10)
relu_net = FeedforwardNet(layer_sizes,relu)
outputs = relu_net.transform(inputs)
perturbation_0 = np.zeros(inputs.shape)
perturbation_0[:,0] = np.ones(len(inputs))*dx
outputs_0 = relu_net.transform(inputs + perturbation_0)
perturbation_1 = np.zeros(inputs.shape)
perturbation_1[:,1] = np.ones(len(inputs))*dx
outputs_1 = relu_net.transform(inputs + perturbation_1)
gradient_mag = np.abs(outputs_0[:,0]-outputs[:,0]) + np.abs(outputs_1[:,0]-outputs[:,0])
pl.title('ReLU')
pl.scatter(inputs[:,0],inputs[:,1],c=gradient_mag,linewidths=0,cmap='rainbow')
pl.xlim(-(size/2),size/2)
pl.ylim(-(size/2),size/2)
start_point = np.array((0,10))
direction = np.array((1,0))
dx=0.01
lambdas = np.arange(-size,size,dx)
points = np.array([start_point+lam*direction for lam in lambdas])
In [519]:
pl.title('ReLU')
pl.scatter(inputs[:,0],inputs[:,1],c=gradient_mag,linewidths=0,cmap='rainbow')
pl.xlim(-(size/2),size/2)
pl.ylim(-(size/2),size/2)
start_point = np.array((40,0))
direction = np.ones(2)/np.sqrt(2)
#direction[0] *= -1
dx=0.01
lambdas = np.arange(-size,size,dx)
points = np.array([start_point+lam*direction for lam in lambdas])
pl.plot(points[:,0],points[:,1],color='white',linestyle='--')
Out[519]:
In [448]:
def plot_derivs(lambdas,outs):
pl.plot(lambdas,outs);
pl.title('Raw output')
pl.ylabel(r'$y(\lambda)$')
pl.xlabel(r'$\lambda$')
pl.figure()
pl.plot(lambdas[1:],np.diff(outs,1,axis=0));
pl.title('First derivative')
pl.ylabel(r'$\frac{\partial y}{\partial \lambda}$')
pl.xlabel(r'$\lambda$')
pl.figure()
pl.plot(lambdas[2:],np.diff(outs,2,axis=0));
pl.title('Second derivative')
pl.ylabel(r'$\frac{\partial^2 y}{\partial \lambda^2}$')
pl.xlabel(r'$\lambda$')
In [520]:
print('ReLU')
outs = relu_net.transform(points)
plot_derivs(lambdas,outs)
In [521]:
print('Sigmoid')
outs = sigmoid_net.transform(points)
plot_derivs(lambdas,outs)
In [368]:
npr.seed(1)
net = FeedforwardNet([32,64,64,64],relu)
In [369]:
# start at a point in input space and project along a random direction
#direction = npr.randn(32)
direction = np.zeros(32)
direction[0] = 1.0
#start_point = npr.randn(32)
start_point = np.ones(32)
size=100
dx=0.01
lambdas = np.arange(-size,size,dx)
points = np.array([start_point+lam*direction for lam in lambdas])
In [370]:
# see what happens to network output
outs = net.transform(points)
outs.shape
Out[370]:
In [371]:
np.diff(outs,axis=0).shape
Out[371]:
In [372]:
outs.shape,np.min(outs,0).shape
Out[372]:
In [430]:
plot_derivs(lambdas,outs)
In [374]:
n=7
pl.plot(lambdas[n:],np.diff(outs,n,axis=0));
pl.title('Second derivative')
pl.ylabel(r'$\frac{\partial^n y}{\partial \lambda^n}$')
pl.xlabel(r'$\lambda$')
Out[374]:
In [375]:
n2 = len(outs)-2
pl.scatter(lambdas[2:],(np.sum(np.isclose(np.diff(outs,2,axis=0),0),1)!=outs.shape[1])+npr.randn(n2)*0.03,
alpha=0.5,linewidths=0)
pl.ylim(0.5,1.5)
Out[375]:
In [376]:
discontinuities = lambdas[2:][(np.sum(np.isclose(np.diff(outs,2,axis=0),0),1)!=outs.shape[1])]
In [377]:
sns.kdeplot(discontinuities)
pl.title('Discontinuities around (1,1,1,...,1,1,1)')
pl.xlabel(r'$\lambda$')
pl.ylabel('Density of discontinuities')
pl.xlim(-size,size)
Out[377]:
In [365]:
start_point=np.ones(32)
num_discontinuities = []
for i in range(32):
direction = np.zeros(32)
direction[i] = 1.0
size=100
dx=0.01
lambdas = np.arange(-size,size,dx)
points = np.array([start_point+lam*direction for lam in lambdas])
outs = net.transform(points)
discontinuities = lambdas[2:][(np.sum(np.isclose(np.diff(outs,2,axis=0),0),1)!=outs.shape[1])]
num_discontinuities.append(len(discontinuities))
sns.kdeplot(discontinuities)
print(np.mean(num_discontinuities),np.std(num_discontinuities))
In [366]:
def discontinuity_density(start_point,
num_directions=32,
size=100,dx=0.01):
num_discontinuities = []
for i in range(num_directions):
direction = npr.randn(len(start_point))
direction /= np.linalg.norm(direction)
lambdas = np.arange(-size,size,dx)
points = np.array([start_point+lam*direction for lam in lambdas])
outs = net.transform(points)
discontinuities = lambdas[2:][(np.sum(np.isclose(np.diff(outs,2,axis=0),0),1)!=outs.shape[1])]
discont_points = np.array([start_point+lam*direction for lam in discontinuities])
num_discontinuities.append(len(discontinuities))
sns.kdeplot(discontinuities)
pl.xlim(-size,size)
print(np.mean(num_discontinuities),np.std(num_discontinuities))
In [378]:
discontinuity_density(np.zeros(32))
In [337]:
discontinuity_density(np.ones(32))
In [341]:
discontinuity_density(np.ones(32)*3)
In [338]:
discontinuity_density(np.ones(32)*10)
In [339]:
discontinuity_density(np.ones(32)*100)
In [340]:
discontinuity_density(np.ones(32)*500)
In [472]:
from time import time
In [547]:
# hunting for discontinuities
# pick a random point, pick a random direction, store the locations of all discontinuities encountered
size=50
dx=0.01
all_discontinuities = []
num_points=1000
input_dim=2
initial_points = npr.randn(num_points,input_dim)*10
t = time()
all_points = []
for i in range(num_points):
initial_point = initial_points[i]
direction = npr.randn(len(start_point))
direction /= np.linalg.norm(direction)
lambdas = np.arange(-size,size,dx)
points = np.array([initial_point+lam*direction for lam in lambdas])
all_points.append(points)
outs = net.transform(points)
discontinuities = lambdas[2:][(np.sum(np.isclose(np.diff(outs,2,axis=0),0),1)!=outs.shape[1])]
discont_points = np.array([initial_point+lam*direction for lam in discontinuities])
all_discontinuities.append(discont_points)
if i % 100 == 0:
print(i)
print(time() - t)
In [548]:
all_discontinuities = np.vstack(all_discontinuities)
all_points = np.vstack(all_points)
In [549]:
norms = np.array([np.linalg.norm(d) for d in all_discontinuities])
In [550]:
pl.hist(norms,bins=50);
In [555]:
sns.kdeplot(norms,label='discontinuities')
sns.kdeplot(np.array([np.linalg.norm(p) for p in initial_points]),label='input centerpoints')
sns.kdeplot(np.array([np.linalg.norm(p) for p in all_points]),label='all tested points')
pl.title('Norms of discontinuous points')
pl.xlabel(r'$\ell_2$-norm')
pl.ylabel('Probability density')
pl.xlim(0,70)
Out[555]:
In [620]:
axes = []
ax = sns.kdeplot(all_discontinuities[:,0],label='discontinuities (x)')
axes.append(ax)
ax = sns.kdeplot(all_discontinuities[:,1],label='discontinuities (y)')
axes.append(ax)
ax = sns.kdeplot(initial_points[:,0],label='input centerpoints (x)')
axes.append(ax)
ax = sns.kdeplot(initial_points[:,1],label='input centerpoints (y)')
axes.append(ax)
ax = sns.kdeplot(all_points[:,0],label='all testpoints (x)')
axes.append(ax)
ax = sns.kdeplot(all_points[:,1],label='all testpoints (y)')
axes.append(ax)
pl.title('Absolute coordinate values of discontinuous points')
pl.xlabel('Value')
pl.ylabel('Probability density')
pl.xlim(-70,70)
Out[620]:
In [558]:
axes = []
ax = sns.kdeplot(all_discontinuities[:,0],label='discontinuities')
axes.append(ax)
ax = sns.kdeplot(initial_points[:,0],label='input centerpoints')
axes.append(ax)
ax = sns.kdeplot(all_points[:,0],label='all testpoints')
axes.append(ax)
pl.title('Coordinate values of discontinuous points')
pl.xlabel('Value')
pl.ylabel('Probability density')
pl.xlim(-70,70)
Out[558]:
In [586]:
l = ax.lines[2]
test_dist = l.get_xydata()
l = ax.lines[0]
res_dist = l.get_xydata()
In [587]:
res_dist.shape
Out[587]:
In [589]:
np.min(res_dist[:,0]),np.max(res_dist[:,0])
Out[589]:
In [590]:
np.min(test_dist[:,0]),np.max(test_dist[:,0])
Out[590]:
In [594]:
pl.plot(np.interp(res_dist[:,0],test_dist[:,0],test_dist[:,1])/np.interp(test_dist[:,0],test_dist[:,0],test_dist[:,1]))
Out[594]:
In [588]:
pl.plot(res_dist[:,1]/test_dist[:,1])
Out[588]:
In [584]:
quotient = []
for i in range(len(test_dist)):
if test_dist[i,0]==res_dist[i,0]:
quotient.append(res_dist[i,1]/test_dist[i,1])
pl.plot(quotient)
Out[584]:
In [648]:
# hunting for discontinuities
# pick a random point, pick a random direction, store the locations of all discontinuities encountered
layer_sizes = [32,64,64,64]
npr.seed(10)
net = FeedforwardNet(layer_sizes)
size=5
dx=0.01
all_discontinuities = []
num_points=1000
input_dim=layer_sizes[0]
initial_points = npr.rand(num_points,input_dim)
t = time()
all_points = []
for i in range(num_points):
initial_point = initial_points[i]
direction = npr.randn(input_dim)
direction /= np.linalg.norm(direction)
lambdas = np.arange(-size,size,dx)
points = np.array([initial_point+lam*direction for lam in lambdas])
all_points.append(points)
outs = net.transform(points)
discontinuities = lambdas[2:][(np.sum(np.isclose(np.diff(outs,2,axis=0),0),1)!=outs.shape[1])]
discont_points = np.array([initial_point+lam*direction for lam in discontinuities])
all_discontinuities.append(discont_points)
if i % 100 == 0:
print(i)
print(time() - t)
In [649]:
all_discontinuities = np.vstack(all_discontinuities)
all_points = np.vstack(all_points)
norms = np.array([np.linalg.norm(d) for d in all_discontinuities])
In [650]:
sns.kdeplot(norms,label='discontinuities')
#sns.kdeplot(np.array([np.linalg.norm(p) for p in initial_points]),label='input centerpoints')
sns.kdeplot(np.array([np.linalg.norm(p) for p in all_points]),label='all tested points')
pl.title('Norms of discontinuous points')
pl.xlabel(r'$\ell_2$-norm')
pl.ylabel('Probability density')
#pl.xlim(0,70)
Out[650]:
In [633]:
sns.kdeplot(np.array([np.linalg.norm(p) for p in npr.rand(1000,32)]))
Out[633]:
In [603]:
pl.plot(np.interp(np.arange(-10,10,0.1),npr.randn(10)*5,npr.randn(10)))
Out[603]:
In [616]:
npr.seed(0)
layer_one = (npr.randn(100)*5,npr.randn(100))
layer_two = (npr.randn(10)*5,npr.randn(10))
In [617]:
one_out = np.interp(np.arange(-10,10,0.1),*layer_one)
two_out = np.interp(np.arange(-10,10,0.1),*layer_two)
one_two_out = np.interp(one_out,*layer_two)
In [618]:
pl.plot(one_out)
pl.title('Layer 1')
pl.figure()
pl.plot(two_out)
pl.figure()
pl.plot(one_two_out)
Out[618]:
In [ ]: